#!/opt/lampp/bin/perl -w
use strict;
use warnings;
use DBI;
require('function.pl');

# ---------------------------------------------------------------------------
# Ganji crawler: for every city in the city map and every category in %dirs,
# walk the WAP category page -> keyword list page -> listing detail page,
# extract the contact data and store one row per new listing in MySQL.
# ---------------------------------------------------------------------------

# Value stored in the `domain` column of every inserted row.
my $domain = 'ganji';
# Category label (UTF-8 bytes, only used in the progress output) mapped to
# the directory segment of the category URL on wap.ganji.com.
my %dirs = (
'二手' => 'secondmarket',
'全职' => 'wanted',
'兼职' => 'parttime_wanted',
'服务' => 'service_living',
'商务' => 'service_biz'
);
# NOTE(review): database credentials are hard-coded in the source; consider
# moving them to a configuration file or the environment.
my $db_host = 'localhost';
my $db = 'crm';
my $db_user = 'zjxia';
my $db_password = 'bblovebb';
# Fail with a readable message instead of crashing later on an undefined handle.
my $db_conn = DBI->connect("DBI:mysql:database=$db;host=$db_host", $db_user, $db_password)
	or die("fail\tconnect to database $db: $DBI::errstr");
$db_conn->do('set names gb2312');
# Rows that only have a phone-number image go to $table_tmp; rows with a
# textual phone number go straight to $table_online.
my $table_tmp = 'snapshot_ganji';
my $table_online = 'snapshot';
# City name => city URL code, filled by load_gj_cities().
my %cities;
load_gj_cities();
while (my ($city, $city_code) = each %cities) {
	my $city_gbk = utf8_to_gb2312($city);
	while (my ($query, $dir) = each %dirs) {
		my $dir_url = "http://wap.ganji.com/$city_code/$dir/";
		print "$dir_url\n";
		my $sub_dir_content = _gj_fetch_url($dir_url);
		# Every keyword link of the category page; \Q..\E keeps the city code
		# from being interpreted as regex syntax.
		while ($sub_dir_content =~ /<a href="[\w\.:\/]*\/\Q$city_code\E\/(\w+)\/\?type=l">([\d\D]+?)<\/a>/g) {
			my ($keyword_code, $keyword) = ($1, $2);
			my $keyword_gbk = utf8_to_gb2312($keyword);
			my $list_url = "http://wap.ganji.com/$city_code/$keyword_code/?type=l";
			my $list_content = _gj_fetch_url($list_url);
			# Every numbered entry ("<n>.<title>") of the keyword list page.
			ITEM:
			while ($list_content =~ /<a href='\.\/([\w\?=&;]+?)'>\s*\d+\.([\d\D]+?)\s*<\/a>/g) {
				my ($url_suffix, $title) = ($1, $2);
				# Only suffixes with at least five word characters before the
				# '?' are followed; everything else is skipped.
				if ($url_suffix !~ /[\w]{5,}\?/) {
					print "skip\n";
					next ITEM;
				}
				my $url = "http://wap.ganji.com/$city_code/$keyword_code/$url_suffix";
				my $detail_content = _gj_fetch_url($url);
				my $phone_image_name = '';
				my $phone_num = '';
				my $contact_person = '';
				my $crawl_number = int(rand(10));
				# The phone number may be published as an image; download it
				# into images/ganji/ under its original file name.
				if ($detail_content =~ /电话：\s*<img src="http:\/\/bj\.ganji\.com\/tel\/(\w+)\.png"/) {
					$phone_image_name = "$1.png";
					# List-form system() bypasses the shell entirely.
					system('wget', '-q', "http://bj.ganji.com/tel/$phone_image_name", '-O', "images/ganji/$phone_image_name");
				}
				# The source is raw UTF-8 bytes, so the fullwidth colon must be an
				# alternation: inside a character class its three bytes would be
				# matched individually.
				if ($detail_content =~ /联系人(?:：|[\s:])*([\d\D]+?)[\s<]/) {
					$contact_person = $1;
				}
				# The <h3> heading of the detail page overrides the list title.
				if ($detail_content =~ /<h3>([\d\D]+?)<\/h3>/) {
					$title = $1;
				}
				if ($detail_content =~ /<a href="tel:([\d\D]+?)">/) {
					$phone_num = $1;
				}
				if ($phone_num eq '' && $detail_content =~ /联系方式:([\d\-]+)/) {
					$phone_num = $1;
				}
				if ($phone_image_name eq '' && $phone_num eq '') {
					print "no phone\t$url\n";
					next ITEM;
				}
				print "$city\t$query\t$keyword\t$title\t$phone_image_name\t$phone_num\t$contact_person\t$url\n";
				my $title_gbk = utf8_to_gb2312($title);
				my $contact_person_gbk = utf8_to_gb2312($contact_person);
				my $table = '';
				if ($phone_num ne '') {
					# A textual phone number is de-duplicated against the online table.
					next ITEM if _gj_count_rows($db_conn, $table_online, 'phone_num', $phone_num) > 0;
					$table = $table_online;
				}
				elsif ($phone_image_name ne '') {
					# Image-only listings are de-duplicated by image name in the
					# temporary table.
					$table = $table_tmp;
					if (_gj_count_rows($db_conn, $table, 'phone_image_name', $phone_image_name) > 0) {
						print "duplicate\n";
						next ITEM;
					}
					# Only image names of exactly 48 characters are accepted.
					if (length($phone_image_name) != 48) {
						print "unwanted phone image name\t$phone_image_name\n";
						next ITEM;
					}
				}
				# Scraped values are passed as bind parameters so that quotes in
				# the page content cannot break or alter the statement.
				$db_conn->do(
					"insert into $table(domain, city, query, title, url, phone_image_name, phone_num, contact_person, phone_digest, crawl_number) values(?, ?, ?, ?, ?, ?, ?, ?, '', ?)",
					undef,
					$domain, $city_gbk, $keyword_gbk, $title_gbk, $url,
					$phone_image_name, $phone_num, $contact_person_gbk, $crawl_number,
				);
			}
		}
	}
}
$db_conn->disconnect;

# Download $url with wget and return the response body as a string.
# wget is started through a list-form pipe open, so the URL never passes
# through a shell. Returns '' when wget cannot be started or prints nothing.
sub _gj_fetch_url {
	my ($url) = @_;
	my $wget;
	unless (open($wget, '-|', 'wget', '-q', $url, '-O', '-')) {
		warn "fail\trun wget for $url: $!\n";
		return '';
	}
	local $/;	# slurp the whole response at once
	my $content = <$wget>;
	close $wget;
	return defined $content ? $content : '';
}

# Return the number of rows in $table whose $column equals $value.
# $table and $column must be trusted identifiers (they are interpolated);
# $value is passed as a bind parameter. Returns 0 when the query fails.
sub _gj_count_rows {
	my ($dbh, $table, $column, $value) = @_;
	my ($count) = $dbh->selectrow_array("select count(*) from $table where $column = ?", undef, $value);
	return defined $count ? $count : 0;
}

# Fill the file-level %cities hash from the tab-separated file
# 'city_map_ganji' in the current working directory.
# Each line is expected to hold exactly two fields, "<city name>\t<city code>";
# lines with any other number of fields are ignored.
# Takes no arguments and dies when the file cannot be opened.
sub load_gj_cities {
	my $map_file = 'city_map_ganji';
	open my $in, '<', $map_file or die("fail\topen $map_file: $!");
	while (my $line = <$in>) {
		# Strip the line ending only (LF or CRLF). Unlike chop, this keeps the
		# last character of a final line that has no trailing newline.
		$line =~ s/[\r\n]+\z//;
		my @fields = split /\t/, $line;
		if (@fields == 2) {
			$cities{$fields[0]} = $fields[1];
		}
	}
	close $in;
}
